from bertopic import BERTopic
# Reload the BERTopic model that was saved to disk.
model_path = "/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_model"
topic_model = BERTopic.load(model_path)
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
import pandas as pd
# Read the spreadsheet and pull its "docs" and "Year" columns out as plain lists.
corpus_path = "/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time.xlsx"
df = pd.read_excel(corpus_path)
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()

# "Topic" column of the model's topic overview table.
topic_model.get_topic_info()["Topic"]

# Per-document information, restricted to the columns of interest.
doc_info_columns = ["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]
topic_model.get_document_info(docs)[doc_info_columns]
| | Topic | Name | Top_n_words | Probability | Representative_document |
|---|---|---|---|---|---|
| 0 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.098617 | False |
| 1 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.186230 | False |
| 2 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.397535 | False |
| 3 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.186223 | False |
| 4 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.154408 | False |
| ... | ... | ... | ... | ... | ... |
| 63023 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.957031 | False |
| 63024 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.958408 | False |
| 63025 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.470342 | False |
| 63026 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.151238 | False |
| 63027 | -1 | -1_energy_development_management_measure | energy - development - management - measure - ... | 0.247359 | False |
63028 rows × 5 columns
from collections import Counter

# Tally every whitespace-separated token across all documents.
counts = Counter(word for doc in docs for word in doc.split())

# Full (word, count) list, most frequent first; ties keep first-seen order,
# exactly as a stable descending sort of the items would.
items = counts.most_common()

# Print at most the 100 most frequent tokens. Slicing (rather than indexing
# 0..99) avoids an IndexError when the vocabulary has fewer than 100 entries.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
energy 32557 land 26785 development22243 plan 16386 forest 16345 national 15246 agricultural14260 management14175 environmental14088 activity 13607 public 13302 establish 13100 purpose 12904 article 12576 protection11571 policy 10876 resource 10656 project 10645 sector 10475 measure 10026 production 9984 system 9774 power 9735 product 9501 set 9461 provide 9039 emission 8984 water 8951 environment 8878 procedure 8671 include 8475 natural 8443 consist 8282 regulation 8186 gas 8008 promote 7947 service 7866 condition 7784 support 7747 requirement 7626 control 7420 renewable 7393 electricity 7231 application 7224 sustainable 7014 organization 6967 implementation 6950 aim 6861 rule 6731 ensure 6729 efficiency 6713 economic 6623 government 6618 regulate 6473 carry 6467 standard 6391 establishes 6336 relate 6250 legal 6119 plant 6115 operation 6014 objective 5929 grant 5924 implement 5871 rural 5809 authority 5751 person 5731 program 5658 process 5527 building 5515 market 5426 action 5421 strategy 5383 create 5326 local 5293 `` 5242 function 5187 minister 5171 right 5170 tax 5153 waste 5143 quality 5101 property 5085 conservation 5071 text 5037 fuel 4956 increase 4947 investment 4932 agreement 4904 source 4841 technical 4828 supply 4799 form 4736 fund 4733 divide 4693 framework 4690 registration 4664 issue 4662 protect 4651 reduce 4640
# Search the model for 5 topics similar to the term "Transport", then display
# the (word, weight) pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Transport", top_n=5)
first_match = similar_topics[0]
topic_model.get_topic(first_match)
[('grassland', 0.7370147131588162),
('permanent', 0.16952735063090357),
('supplementary', 0.08571556527024189),
('prohibition', 0.06655119250597118),
('liability', 0.05962473348425665),
('implement', 0.05679452995567585),
('ordinance', 0.04459698230176548),
('farming', 0.03570748090982378),
('afforestation', 0.034822982723819756),
('reforestation', 0.033169812063301087)]
# Search the model for 5 topics similar to the term "Industry", then display
# the (word, weight) pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Industry", top_n=5)
first_match = similar_topics[0]
topic_model.get_topic(first_match)
[('fishery', 0.13944542494364653),
('agriculture', 0.07943544997415505),
('aquatic', 0.05477908036914138),
('fishing', 0.05304962776556011),
('fisheries', 0.04572184777019522),
('aquaculture', 0.037815564170875074),
('forestry', 0.037434174371047554),
('shellfish', 0.023419507524976874),
('concern', 0.02104295450374762),
('habitat', 0.018744302196177558)]
# Search the model for 5 topics similar to the phrase "Energy systems", then
# display the (word, weight) pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Energy systems", top_n=5)
first_match = similar_topics[0]
topic_model.get_topic(first_match)
[('biodiversity', 0.18095997991522123),
('biological', 0.04911069515872883),
('sectoral', 0.024109475728398406),
('conserve', 0.023615540593107116),
('strategic', 0.022708791554987603),
('specie', 0.020704883791587),
('objective', 0.019437590501399926),
('management', 0.018636830995872957),
('genetic', 0.01731194493947348),
('equitable', 0.017097970345597457)]
# Search the model for 5 topics similar to the term "Buildings", then display
# the (word, weight) pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Buildings", top_n=5)
first_match = similar_topics[0]
topic_model.get_topic(first_match)
[('energy', 0.010344741577772185),
('management', 0.008305760013746135),
('development', 0.008278847252110371),
('mineral', 0.007564087707767394),
('resource', 0.00745341265083421),
('protection', 0.007203174117237951),
('property', 0.006589749077854606),
('vehicle', 0.0062958130351013514),
('timber', 0.006279371535782798),
('exploration', 0.006189109348312551)]
# Number of entries in `docs`.
len(docs)
63028
import os
# Output directory for the exported figure files.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, without the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(images_path, exist_ok=True)
import plotly.io as pio
# Make SVG the default format for kaleido static-image export.
pio.kaleido.scope.default_format = "svg"

# Bar chart figure: 20 topics, 10 words per topic, 300x300 sizing.
barchart_options = dict(top_n_topics=20, n_words=10, width=300, height=300)
fig = topic_model.visualize_barchart(**barchart_options)

barchart_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_barchart.svg'
pio.write_image(fig, barchart_svg)

# Final expression of the cell, so the notebook renders the figure inline.
fig
# Build the model's heatmap figure, save it as SVG, and display it.
fig2 = topic_model.visualize_heatmap()
heatmap_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_heatmap.svg'
pio.write_image(fig2, heatmap_svg)
fig2
# Build the model's topic overview figure, save it as SVG, and display it.
fig3 = topic_model.visualize_topics()
topics_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_visualize_topics.svg'
pio.write_image(fig3, topics_svg)
fig3
# One location for the saved hierarchy table, so the write and the read-back
# below always refer to the same file regardless of the working directory.
# (The write previously used a cwd-relative name while the read used this
# absolute path.)
hierarchy_xlsx = "/home/zhhuang/climate_policy_paper/code/Topic_hierarchical_topics.xlsx"

# Compute the topic hierarchy from the documents.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Persist the table; strings_to_urls=False keeps xlsxwriter from turning
# URL-like cell text into hyperlinks.
with pd.ExcelWriter(hierarchy_xlsx, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    hierarchical_topics.to_excel(writer)

# NOTE(review): the in-memory frame is replaced by its Excel round-trip before
# plotting. The round-trip adds the saved index as a column and may change cell
# types (e.g. list-valued cells come back as text) — confirm
# visualize_hierarchy is unaffected.
hierarchical_topics = pd.read_excel(hierarchy_xlsx)

# Render the hierarchy figure, save it as SVG, and display it.
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
pio.write_image(fig4, '/home/zhhuang/climate_policy_paper/paper_images/topic_hierarchical_topics.svg')
fig4
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [20:48<00:00, 21.53s/it]
# Normalise every year label to a string, mapping the placeholder year 0 to
# "2020". Comparing str(year) catches the placeholder whether the "Year" column
# was read as text ('0') or as an integer (0); a plain `year == '0'` test only
# matches the text form. The list is updated in place.
# NOTE(review): assumes 0 marks a missing year — confirm against the data.
timestamp[:] = ["2020" if str(year) == "0" else str(year) for year in timestamp]

# Topic representations over time: timestamps parsed as "%Y", nr_bins=20.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Save to the same absolute location the notebook later reads this workbook
# back from, instead of a name relative to the current working directory.
# strings_to_urls=False keeps xlsxwriter from turning URL-like text into links.
topics_over_time_xlsx = "/home/zhhuang/climate_policy_paper/code/Topic_topics_over_time.xlsx"
with pd.ExcelWriter(topics_over_time_xlsx, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
19it [2:36:39, 494.73s/it]
# Reload the saved topics-over-time table, plot it, save the SVG, and display it.
over_time_xlsx = "/home/zhhuang/climate_policy_paper/code/Topic_topics_over_time.xlsx"
topics_over_time = pd.read_excel(over_time_xlsx)

fig5 = topic_model.visualize_topics_over_time(topics_over_time)
over_time_svg = '/home/zhhuang/climate_policy_paper/paper_images/topic_visualize_topics_over_time.svg'
pio.write_image(fig5, over_time_svg)
fig5